To begin we will follow the example below. Look carefully at the syntax
library(ggplot2)
library(RColorBrewer)
data(diamonds)
qplot(data=diamonds, x=carat,y=price,color=cut) + scale_color_brewer(palette = 'Accent')
getwd()
## [1] "/Users/brad/Documents/EDA_Data"
setwd('~/Documents/EDA_Data')
statesInfo<-read.csv('stateData.csv')
subset(statesInfo, state.region==1)
## X state.abb state.area state.region population income
## 7 Connecticut CT 5009 1 3100 5348
## 19 Maine ME 33215 1 1058 3694
## 21 Massachusetts MA 8257 1 5814 4755
## 29 New Hampshire NH 9304 1 812 4281
## 30 New Jersey NJ 7836 1 7333 5237
## 32 New York NY 49576 1 18076 4903
## 38 Pennsylvania PA 45333 1 11860 4449
## 39 Rhode Island RI 1214 1 931 4558
## 45 Vermont VT 9609 1 472 3907
## illiteracy life.exp murder highSchoolGrad frost area
## 7 1.1 72.48 3.1 56.0 139 4862
## 19 0.7 70.39 2.7 54.7 161 30920
## 21 1.1 71.83 3.3 58.5 103 7826
## 29 0.7 71.23 3.3 57.6 174 9027
## 30 1.1 70.93 5.2 52.5 115 7521
## 32 1.4 70.55 10.9 52.7 82 47831
## 38 1.0 70.43 6.1 50.2 126 44966
## 39 1.3 71.90 2.4 46.4 127 1049
## 45 0.6 71.64 5.5 57.1 168 9267
For the syntax above we have the following relationship: dataSet[Row condition, column condition]
If left blank, yields all columns
statesInfo[statesInfo$state.region==1,]
## X state.abb state.area state.region population income
## 7 Connecticut CT 5009 1 3100 5348
## 19 Maine ME 33215 1 1058 3694
## 21 Massachusetts MA 8257 1 5814 4755
## 29 New Hampshire NH 9304 1 812 4281
## 30 New Jersey NJ 7836 1 7333 5237
## 32 New York NY 49576 1 18076 4903
## 38 Pennsylvania PA 45333 1 11860 4449
## 39 Rhode Island RI 1214 1 931 4558
## 45 Vermont VT 9609 1 472 3907
## illiteracy life.exp murder highSchoolGrad frost area
## 7 1.1 72.48 3.1 56.0 139 4862
## 19 0.7 70.39 2.7 54.7 161 30920
## 21 1.1 71.83 3.3 58.5 103 7826
## 29 0.7 71.23 3.3 57.6 174 9027
## 30 1.1 70.93 5.2 52.5 115 7521
## 32 1.4 70.55 10.9 52.7 82 47831
## 38 1.0 70.43 6.1 50.2 126 44966
## 39 1.3 71.90 2.4 46.4 127 1049
## 45 0.6 71.64 5.5 57.1 168 9267
Now, to find subsets for illiteracy and high school graduation rate from the statesInfo data set, we compute the following
illiteracybracket<-statesInfo[statesInfo$illiteracy==0.5,]
highschoolgradbracket <- statesInfo[statesInfo$highSchoolGrad>50,]
# Scatter of illiteracy by state region, colored by high-school grad rate.
# Fix: with data=statesInfo supplied, refer to the column by its bare name;
# mixing in statesInfo$highSchoolGrad is inconsistent and silently breaks if
# the data argument is later subset or filtered.
qplot(data=statesInfo, x=state.region, y=illiteracy, color=highSchoolGrad)
We now move onto the following data set # REDDIT DATA SET
reddit<-read.csv("reddit.csv")
str(reddit)
## 'data.frame': 32754 obs. of 14 variables:
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ gender : int 0 0 1 0 1 0 0 0 0 0 ...
## $ age.range : Factor w/ 7 levels "18-24","25-34",..: 2 2 1 2 2 2 2 1 3 2 ...
## $ marital.status : Factor w/ 6 levels "Engaged","Forever Alone",..: NA NA NA NA NA 4 3 4 4 3 ...
## $ employment.status: Factor w/ 6 levels "Employed full time",..: 1 1 2 2 1 1 1 4 1 2 ...
## $ military.service : Factor w/ 2 levels "No","Yes": NA NA NA NA NA 1 1 1 1 1 ...
## $ children : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ education : Factor w/ 7 levels "Associate degree",..: 2 2 5 2 2 2 5 2 2 5 ...
## $ country : Factor w/ 439 levels " Canada"," Canada eh",..: 394 394 394 394 394 394 125 394 394 125 ...
## $ state : Factor w/ 53 levels "","Alabama","Alaska",..: 33 33 48 33 6 33 1 6 33 1 ...
## $ income.range : Factor w/ 8 levels "$100,000 - $149,999",..: 2 2 8 2 7 2 NA 7 2 7 ...
## $ fav.reddit : Factor w/ 1834 levels "","___","-","?",..: 720 691 1511 1528 188 691 1318 571 1629 1 ...
## $ dog.cat : Factor w/ 3 levels "I like cats.",..: NA NA NA NA NA 2 2 2 1 1 ...
## $ cheese : Factor w/ 11 levels "American","Brie",..: NA NA NA NA NA 3 3 1 10 7 ...
We call a Factor a variable which contains some categorical information, such as gender, age, etc. For example we can determine employment status as follows
table(reddit$employment.status)
##
## Employed full time
## 14814
## Freelance
## 1948
## Not employed and not looking for work
## 682
## Not employed, but looking for work
## 2087
## Retired
## 85
## Student
## 12987
The above yields a list of employment status entries.
summary(reddit$employment.status)
## Employed full time
## 14814
## Freelance
## 1948
## Not employed and not looking for work
## 682
## Not employed, but looking for work
## 2087
## Retired
## 85
## Student
## 12987
## NA's
## 151
levels(reddit$age.range)
## [1] "18-24" "25-34" "35-44" "45-54" "55-64"
## [6] "65 or Above" "Under 18"
# notice 7 different ranges
qplot(reddit$age.range)
qplot(data=reddit, x=income.range)
We can rearrange the levels so that the graph is easier to read/more cogent.
# Reorder age.range as an ordered factor so plots sort youngest to oldest.
# NOTE(review): the trailing 'NA' level is a character string and will never
# match real missing values (which are NA, not "NA") — confirm intent.
reddit$age.range<-ordered(reddit$age.range, levels=c('Under 18', '18-24', '25-34', '35-44', '45-54', '55-64','65 or Above', 'NA'))
qplot(data=reddit, x=age.range)
Now the data is rearranged as required.
Alternatively, we can rearrange the data as follows
reddit$age.range <- factor(reddit$age.range, levels=c('Under 18', '18-24', '25-34', '35-44', '45-54', '55-64','65 or Above', 'NA'), ordered = T)
Similarly, we can do the same for the income brackets.
# Order the income brackets from lowest to highest.
# Fix: the original called ordered(reddit$age.range, ...) here, clobbering
# income.range with age data (none of whose values match these income levels).
# Order the income column itself. The string level 'NA' is also dropped:
# missing values are NA, not the text "NA", so that level could never match.
reddit$income.range<-ordered(reddit$income.range, levels=c("Under $20,000", '$20,000 - $29,999', '$30,000 - $39,999', '$40,000 - $49,999', '$50,000 - $69,999', '$70,000 - $99,999', '$100,000 - $149,999', '$150,000 or more'))
qplot(data=reddit, x=income.range)
#Lesson 3 - exploring one variable First I downloaded the pseudo-facebook data, which is found in the
EDA folder We can see the current working directory with the following command
getwd()
## [1] "/Users/brad/Documents/EDA_Data"
To change the directory we use the following command
setwd('~/Documents/EDA_Data')
list.files()
## [1] "energy.csv"
## [2] "energy.xlsx"
## [3] "GDP.csv"
## [4] "GDP.xlsx"
## [5] "HIV.csv"
## [6] "HIV.xlsx"
## [7] "indicator alcohol consumption 20100830.xlsx"
## [8] "indicator_t above 15 unemploy (1).xlsx"
## [9] "indicator_t above 15 unemploy.xlsx"
## [10] "pseudo_facebook.tsv"
## [11] "reddit.csv"
## [12] "stateData.csv"
## [13] "suicide.csv"
## [14] "suicide.xlsx"
## [15] "udacity_workbook_files"
## [16] "udacity_workbook.html"
## [17] "udacity_workbook.R"
## [18] "udacity_workbook.Rmd"
## [19] "unemployment.csv"
pf<-read.csv('pseudo_facebook.tsv', sep='\t')
names(pf)
## [1] "userid" "age"
## [3] "dob_day" "dob_year"
## [5] "dob_month" "gender"
## [7] "tenure" "friend_count"
## [9] "friendships_initiated" "likes"
## [11] "likes_received" "mobile_likes"
## [13] "mobile_likes_received" "www_likes"
## [15] "www_likes_received"
Notice that the pseudo_facebook data is actually a TAB separated file (.tsv). Therefore, we must use the following command.
This data set was created by Udacity, so the stats may not align.
To begin the analysis, we start by looking at birth dates of pseudo-facebook users.
library(ggplot2)
#install.packages('ggthemes', dependencies = TRUE)
library(ggthemes)
names(pf)
## [1] "userid" "age"
## [3] "dob_day" "dob_year"
## [5] "dob_month" "gender"
## [7] "tenure" "friend_count"
## [9] "friendships_initiated" "likes"
## [11] "likes_received" "mobile_likes"
## [13] "mobile_likes_received" "www_likes"
## [15] "www_likes_received"
A histogram of days of the month that individuals were born on is given by the command below
ggplot(aes(x = dob_day), data = pf) +
geom_histogram(binwidth = 1) +
scale_x_continuous(breaks = 1:31)
To motivate what we are actually going to do with the data, consider the following question: ##Q: Does users’ perception of their audience actually match their audience? This is an interesting question because it depends on how a user presents themselves. The survey asked: “How many people do you think saw this post?” As the researcher tells us, the gap between expectation and reality was apparent, as people actually underestimated audience size.
Now we begin to move into a discussion of ggplot. While using the ggplot environment, the + sign indicates a layer of a plot or graph. We can add a variety of layers, such as geom_point() for scatter plots, geom_line() for line plots, or nothing for a histogram.
Now we add facet_wrap function to break up the data into months. Also, note that the aes is necessary for the ggplot functions.
ggplot(aes(x = dob_day), data = pf) +
geom_histogram(binwidth = 1) +
scale_x_continuous(breaks = 1:31) +
facet_wrap(~dob_month,ncol=4)
Notice that changing ncol=n, yields a different number of columns, i.e. 3 or 4, etc. Syntax facet_wrap(~variable)
The following code will produce a histogram of the number of the friend count.
qplot(x=friend_count, data=pf)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Furthermore, we can limit the range of the x axis with the following code
qplot(x=friend_count, data=pf, xlim=c(0,1000))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2951 rows containing non-finite values (stat_bin).
Or rather
qplot(x=friend_count, data=pf) +
scale_x_continuous(limit=c(0,1000))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2951 rows containing non-finite values (stat_bin).
Notice the bin width error produced. #adding the following command then yields
qplot(x=friend_count, data=pf, binwidth=25) +
scale_x_continuous(limit=c(0,1000), breaks=seq(0,1000,50))
## Warning: Removed 2951 rows containing non-finite values (stat_bin).
This makes the distribution a bit easier to read.
A: We can see this by computing the following data.
qplot(x=friend_count, data=pf, binwidth=25) +
scale_x_continuous(limit=c(0,1000), breaks=seq(0,1000,50)) +
facet_grid(~gender)
## Warning: Removed 2951 rows containing non-finite values (stat_bin).
In this example, gender is the splitting variable in the
facet_grid.
Notice in the above that we got three columns, since there was some missing data which R interprets as NA. To look at just the two genders, we can run the following code
ggplot(aes(x = friend_count), data = subset(pf, !is.na(gender))) +
geom_histogram() +
scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) +
facet_wrap(~gender)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2949 rows containing non-finite values (stat_bin).
Note we could use na.omit(), too, but it is a bit more problematic since it could remove more than we intended.
Now we want to determine who has more friends on average. To see this run the table command to see if there are more or less men v women.
table(pf$gender)
##
## female male
## 40254 58574
This table tells us that there are 40254 females and 58574 males. Moreover, the function below runs the summary function
by(pf$friend_count,pf$gender,summary)
## pf$gender: female
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 37 96 242 244 4923
## --------------------------------------------------------
## pf$gender: male
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 27 74 165 182 4917
The function above applies the summary function to the friend counts of each gender group.
To begin, consider the following plot
ggplot(aes(x = tenure), data = pf) +
geom_histogram(binwidth = 30, color = 'black', fill = '#099DD9')
## Warning: Removed 2 rows containing non-finite values (stat_bin).
Where
tenure is the number of days that a person has been a member of pseudo-facebook
Rescaling the tenure to years we obtain the following plot
# Tenure histogram rescaled to years (binwidth = one month).
# Fix: the first positional argument of scale_x_continuous() is the axis
# *name*, so the original seq(1,7,1) was silently consumed as a title
# rather than tick positions; pass it explicitly as breaks=.
qplot(x=tenure/365, data=pf, binwidth=1/12, color = I('black'), fill = I('#099DD9'), ylab='tenure', xlab='years')+
scale_x_continuous(breaks = seq(1,7,1), limits = c(0,7))
## Warning: Removed 26 rows containing non-finite values (stat_bin).
Further adjusting the data we obtain
# Age histogram with one-year bins.
# Fix: seq(5,110,5) must be passed as breaks=; positionally it lands in the
# `name` (axis title) argument of scale_x_continuous().
qplot(x=age, data=pf, binwidth=1, color = I('black'), fill = I('#099DD9')) +
scale_x_continuous(breaks = seq(5,110,5), limits = c(5,110))
## Warning: Removed 238 rows containing non-finite values (stat_bin).
Sometimes the data we obtain is very dispersed, i.e. tail heavy.
To see this consider the log transform of the following data. Since the order of magnitude is 10^n, it makes sense to take the log to collect similar data points.
The motivation for doing this is so we can get a normally distributed data set. If the resulting data is normally distributed then we have more statistical tools at out disposal.
qplot(x=friend_count, data=pf)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
summary(pf$friend_count)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 31.0 82.0 196.4 206.0 4923.0
summary(log10(pf$friend_count+1))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 1.505 1.919 1.868 2.316 3.692
Notice that we added 1 so that the log is well defined
We can also use other transformations, such as the sqrt, or natural log
summary(sqrt(pf$friend_count))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 5.568 9.055 11.088 14.353 70.164
Now try to add all three plots to a single page. To do this, first install the following page called gridExtra, and store the individual plots as follows.
#install.packages('gridExtra')
library(gridExtra)
p1<-qplot(x=friend_count, data=pf)
p2<-qplot(x=log10(friend_count+1), data=pf)
p3<-qplot(x=sqrt(friend_count),data=pf)
grid.arrange(p1,p2,p3)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Alternatively, we can plot this as follows
p1<-ggplot(aes(x=friend_count), data=pf) + geom_histogram()
p2<-p1+scale_x_log10()
p3<-p1+scale_x_sqrt()
grid.arrange(p1,p2,p3)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Transformation introduced infinite values in continuous x-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1962 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
There are some subtle difference in how qplot and ggplot work, so we will compare below.
logScale<-qplot(x=log10(friend_count+1), data=pf)
countScale<-ggplot(aes(x=friend_count), data=pf) + geom_histogram() +scale_x_log10()
grid.arrange(logScale,countScale, ncol = 2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Transformation introduced infinite values in continuous x-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1962 rows containing non-finite values (stat_bin).
Now we will start constructing frequency polygons. In qplot the syntax is:
qplot(x=friend_count, data= subset(pf,!is.na(gender)), binwidth=10, geom='freqpoly', color=gender) +
scale_x_continuous(lim=c(0,1000), breaks=seq(0,1000,50))
## Warning: Removed 2949 rows containing non-finite values (stat_bin).
## Warning: Removed 4 rows containing missing values (geom_path).
Notice that we added
geom and color functions.
In ggplot the syntax is
ggplot(aes(x = friend_count, y = ..count../sum(..count..)),
data = subset(pf, !is.na(gender))) +
geom_freqpoly(aes(color = gender), binwidth=10) +
scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) +
xlab('Friend Count') +
ylab('Proportion of users with that friend count')
## Warning: Removed 2949 rows containing non-finite values (stat_bin).
## Warning: Removed 4 rows containing missing values (geom_path).
Next we will consider proportions of friend counts
qplot(x=friend_count, y=..count../sum(..count..),
data= subset(pf,!is.na(gender)),
xlab='Friend Count',
ylab='Proportion of users w/ friend count',
binwidth=10, geom='freqpoly', color=gender) +
scale_x_continuous(lim=c(0,1000), breaks=seq(0,1000,50))
## Warning: Removed 2949 rows containing non-finite values (stat_bin).
## Warning: Removed 4 rows containing missing values (geom_path).
Notice the y axis count is different
Note that we can change limits and breaks
Consider the following plots
qplot(x=www_likes,
data= subset(pf,!is.na(gender)),
geom='freqpoly', color=gender) +
scale_x_continuous()+
scale_x_log10()
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
## Warning: Transformation introduced infinite values in continuous x-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 60935 rows containing non-finite values (stat_bin).
We now numerically analyze the number of
www_likes by gender as follows
by(pf$www_likes,pf$gender, sum)
## pf$gender: female
## [1] 3507665
## --------------------------------------------------------
## pf$gender: male
## [1] 1430175
Consider the following freqpoly. Notice we subset the data to eliminate the na gender entries.
qplot(x=friend_count, data= subset(pf,!is.na(gender)), binwidth=10, geom='freqpoly', color=gender) +
scale_x_continuous(lim=c(0,1000), breaks=seq(0,1000,50))
## Warning: Removed 2949 rows containing non-finite values (stat_bin).
## Warning: Removed 4 rows containing missing values (geom_path).
# Boxplot of log10(friend_count) by gender.
# NOTE(review): ylim=c(0,1000) is applied to the *log10* axis, where values
# top out near log10(4923) ~ 3.7, so the upper limit is inert; the removed
# rows reported below are users with friend_count == 0 (log10(0) = -Inf).
qplot(geom='boxplot', data=subset(pf,!is.na(gender)),
x=gender,
y=log10(friend_count),
ylim=c(0,1000))
## Warning: Removed 1962 rows containing non-finite values (stat_boxplot).
We now rescale the data so that the number of outliers is minimized, or we can use
qplot(geom='boxplot', data=subset(pf,!is.na(gender)),
x=gender,
y=friend_count)+
scale_y_continuous(limits=c(0,1000))
## Warning: Removed 2949 rows containing non-finite values (stat_boxplot).
Or rather,
qplot(geom='boxplot', data=subset(pf,!is.na(gender)),
x=gender,
y=friend_count)+
coord_cartesian(ylim=c(0,1000))
Observe: notice that the female box has moved closer to 250 in comparison to the other. We can further adjust this as follows:
qplot(geom='boxplot', data=subset(pf,!is.na(gender)),
x=gender,
y=friend_count)+
coord_cartesian(ylim=c(0,250))
Now compute a summary
by(pf$friend_count,pf$gender,summary)
## pf$gender: female
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 37 96 242 244 4923
## --------------------------------------------------------
## pf$gender: male
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 27 74 165 182 4917
Now we begin analyzing friendships initiated by gender. Consider the following plots
qplot(x=gender, y=friendships_initiated,data=subset(pf,!is.na(gender)),geom='boxplot')+
coord_cartesian(ylim=c(0,150))
We now compute a numerical summary.
by(pf$friendships_initiated,pf$gender,summary)
## pf$gender: female
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 19.0 49.0 113.9 124.8 3654.0
## --------------------------------------------------------
## pf$gender: male
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 15.0 44.0 103.1 111.0 4144.0
To check whether people use mobile check-ins, we use booleans as follows
summary(pf$mobile_likes)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 0.0 4.0 106.1 46.0 25111.0
summary(pf$mobile_likes>0)
## Mode FALSE TRUE
## logical 35056 63947
# Flag users with any mobile likes: 1 if mobile_likes > 0, else 0, stored as
# a factor so summary() reports per-level counts.
# The original first assigned NA and then immediately overwrote it; that dead
# pre-assignment is dropped and the three steps collapse to one statement.
pf$mobile_check_in<-factor(ifelse(pf$mobile_likes>0,1,0))
summary(pf$mobile_check_in)
## 0 1
## 35056 63947
sum(pf$mobile_check_in==1)/length(pf$mobile_check_in)
## [1] 0.6459097
This tells us that roughly 65% of users (0.6459) have checked in on mobile (mobile_check_in == 1).
To begin we will load the necessary packages and library
library(ggplot2)
data(diamonds)
summary(diamonds)
## carat cut color clarity
## Min. :0.2000 Fair : 1610 D: 6775 SI1 :13065
## 1st Qu.:0.4000 Good : 4906 E: 9797 VS2 :12258
## Median :0.7000 Very Good:12082 F: 9542 SI2 : 9194
## Mean :0.7979 Premium :13791 G:11292 VS1 : 8171
## 3rd Qu.:1.0400 Ideal :21551 H: 8304 VVS2 : 5066
## Max. :5.0100 I: 5422 VVS1 : 3655
## J: 2808 (Other): 2531
## depth table price x
## Min. :43.00 Min. :43.00 Min. : 326 Min. : 0.000
## 1st Qu.:61.00 1st Qu.:56.00 1st Qu.: 950 1st Qu.: 4.710
## Median :61.80 Median :57.00 Median : 2401 Median : 5.700
## Mean :61.75 Mean :57.46 Mean : 3933 Mean : 5.731
## 3rd Qu.:62.50 3rd Qu.:59.00 3rd Qu.: 5324 3rd Qu.: 6.540
## Max. :79.00 Max. :95.00 Max. :18823 Max. :10.740
##
## y z
## Min. : 0.000 Min. : 0.000
## 1st Qu.: 4.720 1st Qu.: 2.910
## Median : 5.710 Median : 3.530
## Mean : 5.735 Mean : 3.539
## 3rd Qu.: 6.540 3rd Qu.: 4.040
## Max. :58.900 Max. :31.800
##
Notice that three of the variables have a well defined order.
Summarizing the data we have
summary(diamonds$price)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 326 950 2401 3933 5324 18823
# Count diamonds priced at $15,000 or more. sum() of a logical vector counts
# the TRUEs directly; the ifelse(..., 1, 0) wrapper was redundant.
sum(diamonds$price>=15000)
## [1] 1656
ggplot(aes(x = price), data = diamonds) +
geom_histogram(binwidth = 20, color = 'black', fill = '#099DD9')+
scale_x_continuous(limits=c(0,1500))
## Warning: Removed 33930 rows containing non-finite values (stat_bin).
Now we price by cut (this ought to be useful) I’m not sure what else I computed here, but reading through might make things more apparent.
ggplot(aes(x = price),data = diamonds) +
geom_histogram() +
scale_x_continuous() +
facet_wrap(~cut, scales="free_y")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
by(diamonds$price,diamonds$cut,summary)
## diamonds$cut: Fair
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 337 2050 3282 4359 5206 18574
## --------------------------------------------------------
## diamonds$cut: Good
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 327 1145 3050 3929 5028 18788
## --------------------------------------------------------
## diamonds$cut: Very Good
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 336 912 2648 3982 5373 18818
## --------------------------------------------------------
## diamonds$cut: Premium
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 326 1046 3185 4584 6296 18823
## --------------------------------------------------------
## diamonds$cut: Ideal
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 326 878 1810 3458 4678 18806
summary(diamonds)
## carat cut color clarity
## Min. :0.2000 Fair : 1610 D: 6775 SI1 :13065
## 1st Qu.:0.4000 Good : 4906 E: 9797 VS2 :12258
## Median :0.7000 Very Good:12082 F: 9542 SI2 : 9194
## Mean :0.7979 Premium :13791 G:11292 VS1 : 8171
## 3rd Qu.:1.0400 Ideal :21551 H: 8304 VVS2 : 5066
## Max. :5.0100 I: 5422 VVS1 : 3655
## J: 2808 (Other): 2531
## depth table price x
## Min. :43.00 Min. :43.00 Min. : 326 Min. : 0.000
## 1st Qu.:61.00 1st Qu.:56.00 1st Qu.: 950 1st Qu.: 4.710
## Median :61.80 Median :57.00 Median : 2401 Median : 5.700
## Mean :61.75 Mean :57.46 Mean : 3933 Mean : 5.731
## 3rd Qu.:62.50 3rd Qu.:59.00 3rd Qu.: 5324 3rd Qu.: 6.540
## Max. :79.00 Max. :95.00 Max. :18823 Max. :10.740
##
## y z
## Min. : 0.000 Min. : 0.000
## 1st Qu.: 4.720 1st Qu.: 2.910
## Median : 5.710 Median : 3.530
## Mean : 5.735 Mean : 3.539
## 3rd Qu.: 6.540 3rd Qu.: 4.040
## Max. :58.900 Max. :31.800
##
ggplot(aes(x = log10(price/carat)),data = diamonds) +
geom_histogram() +
scale_x_continuous() +
facet_wrap(~cut, scales="free_y")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
qplot(geom='boxplot', data=diamonds,
x=color,
y=price/carat) +
coord_cartesian( ylim=c(0,7000))
by(diamonds$price,diamonds$color,IQR)
## diamonds$color: D
## [1] 3302.5
## --------------------------------------------------------
## diamonds$color: E
## [1] 3121
## --------------------------------------------------------
## diamonds$color: F
## [1] 3886.25
## --------------------------------------------------------
## diamonds$color: G
## [1] 5117
## --------------------------------------------------------
## diamonds$color: H
## [1] 4996.25
## --------------------------------------------------------
## diamonds$color: I
## [1] 6081.25
## --------------------------------------------------------
## diamonds$color: J
## [1] 5834.5
?diamonds
qplot(x=price, data=diamonds, binwidth=40, geom='freqpoly', color=color)
# Frequency polygon of carat with 0.1-carat bins.
# NOTE(review): mapping color to carat (the x variable itself) produces one
# legend entry per carat value and is likely unintended — confirm whether
# color should be dropped or mapped to a categorical variable such as cut.
qplot(data=diamonds,x=carat, geom = 'freqpoly',binwidth=0.1, color=carat)+
scale_x_continuous(breaks = seq(0,3,0.1),lim= c(0,3))+
scale_y_continuous(breaks = seq(2000,10000,1000))
## Warning: Removed 32 rows containing non-finite values (stat_bin).
## Warning: Removed 2 rows containing missing values (geom_path).
I’m not sure what I did above!
With the diamonds dataset questions complete, I will now begin analyzing the GDP and HIV datasets I found on the Gapminder database. Switching the working directory again, for sanity.
setwd('~/Documents/EDA_Data')
library(tidyr)
gdp <- read.csv("GDP.csv", header=T, check.names = F)
gdp.T <- gather(data=gdp, key='Year', value='GDP', '1960':'2011',convert = TRUE)
range(gdp.T$Year)
## [1] 1960 2011
hiv <- read.csv("HIV.csv", header=T, check.names = F)
hiv.T <- gather(data=hiv, key='Year', value='HIV_prev', '1979':'2011', convert = TRUE)
hiv.T$HIV_prev <- as.numeric(hiv.T$HIV_prev)
summary(hiv.T)
## Estimated HIV Prevalence% - (Ages 15-49) Year HIV_prev
## Abkhazia : 33 Min. :1979 Min. : 0.010
## Afghanistan : 33 1st Qu.:1987 1st Qu.: 0.100
## Akrotiri and Dhekelia: 33 Median :1995 Median : 0.300
## Åland : 33 Mean :1995 Mean : 1.743
## Albania : 33 3rd Qu.:2003 3rd Qu.: 1.200
## Algeria : 33 Max. :2011 Max. :26.500
## (Other) :8877 NA's :5774
gdp.HIV <- merge(gdp.T, hiv.T)
str(gdp.HIV)
## 'data.frame': 2495625 obs. of 5 variables:
## $ Year : int 1979 1979 1979 1979 1979 1979 1979 1979 1979 1979 ...
## $ Income per person (fixed 2000 US$) : Factor w/ 275 levels "Abkhazia","Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ GDP : num NA NA NA NA NA NA NA NA NA NA ...
## $ Estimated HIV Prevalence% - (Ages 15-49): Factor w/ 275 levels "Abkhazia","Afghanistan",..: 1 2 3 5 6 7 8 9 10 12 ...
## $ HIV_prev : num NA NA NA NA NA ...
# Percentage of GDP entries that are missing.
# Fix: the original summarised is.na(...)/nrow(...) elementwise, which yields
# a meaningless six-number summary of tiny fractions; the intended figure is
# the mean of the NA indicator, scaled to percent.
mean(is.na(gdp.HIV$GDP)) * 100
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000e+00 0.000e+00 0.000e+00 1.415e-05 4.007e-05 4.007e-05
summary(gdp.HIV$GDP)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 54.5 590.1 2038.9 7315.1 9243.6 108111.2 881100
summary(gdp.HIV$HIV_prev)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0 0.1 0.3 1.7 1.2 26.5 1587850
qplot(y=gdp.HIV$`Income per person (fixed 2000 US$)`,x=gdp.HIV$GDP, data=gdp.HIV, xlim = c(0,4500))
## Warning: Removed 1458325 rows containing missing values (geom_point).
The goal here is to analyze how a country’s GDP and HIV prevalence are related
Loading the relevant library again.
library(ggplot2)
pf<-read.csv('pseudo_facebook.tsv', sep='\t')
I read in the pseudo_facebook for sanity, too.
The code below creates a scatterplot of age (years) vs friend_count
qplot(x=age,y=friend_count,data=pf)
Or identically
qplot(age,friend_count,data=pf)
Several observations: peaks in the 0-30 range, several peaks around 65-70 (at 69 to be specific), and above 100.
From here on we will switch to ggplot syntax, allowing for more robust plots.
ggplot(aes(x=age, y=friend_count),data=pf)+geom_point()+xlim(13,90)
## Warning: Removed 4906 rows containing missing values (geom_point).
Question: computationally, is one of these faster than the other?
Several attributes of ggplot geom=chart type. look at reference. aes wrapper - aesthetic wrapper. The summary of age variable of pf dataset
summary(pf$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 13.00 20.00 28.00 37.28 50.00 113.00
Note that we added the extra xlim layer since we want to cut the data down a little bit. Overplotting makes it difficult to see how many points are in a certain area. Using the alpha property of geom_point() we can adjust the transparency: 1/20 means it takes 20 overlapping points to appear as one fully opaque point.
ggplot(aes(x=age, y=friend_count),data=pf)+
geom_point(alpha=1/20)+
xlim(13,90)
## Warning: Removed 4906 rows containing missing values (geom_point).
Now we add jitter, which adds noise to a variable of the data set. In this case, we will add jitter to age(?).
ggplot(aes(x=age, y=friend_count),data=pf)+
geom_jitter(alpha=1/20)+
xlim(13,90)
## Warning: Removed 5189 rows containing missing values (geom_point).
Adding jitter makes sense since age was given as an integer, when in reality it ought to be a continuous variable.
Now we add a transformation to the y axis, so that we may change the friend count scale.
ggplot(aes(x=age, y=friend_count),data=pf)+
geom_point(alpha=1/20, position=position_jitter(h=0))+
xlim(13,90)+
coord_trans(y = "sqrt")
## Warning: Removed 5178 rows containing missing values (geom_point).
Notice that I had to add the
position_jitter(h=0), so that we do not get negative values. Otherwise, we will get an error. Think of the friend count conditioned on age now.
Exploring the relationship between friends initiated vs age.range
ggplot(aes(x=age, y=friendships_initiated),data=pf)+
geom_point(alpha=1/10, position=position_jitter(h=1))+
xlim(13,90)
## Warning: Removed 5181 rows containing missing values (geom_point).
First we install dplyr library, and group by age to create a new data frame.
#install.packages('dplyr')
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.1
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:gridExtra':
##
## combine
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
age_groups<-group_by(pf,age)
pf.fc_by_age<-summarise(age_groups, friend_count_mean=mean(friend_count), friend_count_median=median(friend_count),n=n())
This yields a new data frame
pf.fc_by_age<-arrange(pf.fc_by_age,age)
This arranges the frame by age in ascending order.
head(pf.fc_by_age)
## # A tibble: 6 x 4
## age friend_count_mean friend_count_median n
## <int> <dbl> <dbl> <int>
## 1 13 165. 74 484
## 2 14 251. 132 1925
## 3 15 348. 161 2618
## 4 16 352. 172. 3086
## 5 17 350. 156 3283
## 6 18 331. 162 5196
This yields the first few rows of the data frame.
Just use the original data set as follows
pf.fc_by_age<-pf %>%
group_by(age) %>%
summarise(friend_count_mean=mean(friend_count),
friend_count_median=median(friend_count),
n=n()) %>%
arrange(age)
%>% is called the pipe operator; it chains functions onto a data set. The pipe operator is part of the dplyr package (via magrittr).
head(pf.fc_by_age)
## # A tibble: 6 x 4
## age friend_count_mean friend_count_median n
## <int> <dbl> <dbl> <int>
## 1 13 165. 74 484
## 2 14 251. 132 1925
## 3 15 348. 161 2618
## 4 16 352. 172. 3086
## 5 17 350. 156 3283
## 6 18 331. 162 5196
Recall the following plot
ggplot(aes(x=friend_count_mean,y=age),data=pf.fc_by_age)+geom_point()
Or with the line geometry
ggplot(aes(y=friend_count_mean,x=age),data=pf.fc_by_age)+geom_line()
USING GGPLOT TO SUMMARISE DATA First we compute the first scatter plot, then use the additional layer geom_line(summary, ....) to
ggplot(aes(x=age,y=friend_count),data=pf)+
xlim(13,90)+
geom_point(alpha=0.05,position=position_jitter(h=0),
color='orange')+
coord_trans(y='sqrt')+
geom_line(stat="summary",fun.y=mean)+
geom_line(stat="summary", fun.y=quantile, fun.args=list(probs=0.1),color='blue',linetype=2)+
geom_line(stat="summary", fun.y=quantile, fun.args=list(probs=0.5),color='blue')+
geom_line(stat="summary", fun.y=quantile, fun.args=list(probs=0.9),color='blue',linetype=2)
## Warning: Removed 4906 rows containing non-finite values (stat_summary).
## Warning: Removed 4906 rows containing non-finite values (stat_summary).
## Warning: Removed 4906 rows containing non-finite values (stat_summary).
## Warning: Removed 4906 rows containing non-finite values (stat_summary).
## Warning: Removed 5191 rows containing missing values (geom_point).
This plots show us that most of the younger users do NOT have more than 2000 friends. dashed lines (
linetype=2) are the 10th, 50th, and 90th percentiles.
Reanalyzing: if we remove the xlim and coord_trans layers, and replace them with the coord_cartesian() function, we can analyze
ggplot(aes(x=age,y=friend_count),data=pf)+
coord_cartesian(xlim=c(13,70), ylim=c(0,1000))+
geom_point(alpha=0.05,position=position_jitter(h=0),
color='orange')+
geom_line(stat="summary",fun.y=mean,color='red')+
geom_line(stat="summary", fun.y=quantile, fun.args=list(probs=0.1),color='blue',linetype=2)+
geom_line(stat="summary", fun.y=quantile, fun.args=list(probs=0.5),color='blue',linetype=2)+
geom_line(stat="summary", fun.y=quantile, fun.args=list(probs=0.9),color='blue',linetype=2)
INTERPRETATIONS OF THIS PLOT: It is very rare to have more than 800 friends
The cor.test function yields the correlation coefficient with a given confidence level (conf.level).
cor.test(x=pf$age,y=pf$friend_count, method="pearson", conf.level = 0.95)
##
## Pearson's product-moment correlation
##
## data: pf$age and pf$friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.03363072 -0.02118189
## sample estimates:
## cor
## -0.02740737
with(pf,cor.test(age,friend_count,method='pearson'))
##
## Pearson's product-moment correlation
##
## data: age and friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.03363072 -0.02118189
## sample estimates:
## cor
## -0.02740737
Notice that the correlation coefficient is very small for this particular pair of variables (pf$age, pf$friend_count).
Next we analyze the correlation coeff for subsets of the data so that we can only consider the range 13-70.
Consider the following code
with(subset(pf,age<68),cor.test(age,friend_count,method='pearson'))
##
## Pearson's product-moment correlation
##
## data: age and friend_count
## t = -56.599, df = 89358, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1923557 -0.1796963
## sample estimates:
## cor
## -0.1860337
Note: cor.test uses pearson by default.
Measuring strength of increasing or decreasing, i.e. rank.
ggplot(data = pf,aes(x = www_likes_received, y = likes_received)) +
geom_point(alpha=1/20)+
xlim(0,quantile(pf$www_likes_received,0.95))+
ylim(0,quantile(pf$likes_received,0.95))+
geom_smooth(method = 'lm', color="red")
## Warning: Removed 6075 rows containing non-finite values (stat_smooth).
## Warning: Removed 6075 rows containing missing values (geom_point).
We can check to see how strong the relationship is by using the
with and cor.test functions.
with(pf,cor.test(www_likes_received,likes_received,method='pearson'))
##
## Pearson's product-moment correlation
##
## data: www_likes_received and likes_received
## t = 937.1, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9473553 0.9486176
## sample estimates:
## cor
## 0.9479902
# NOTE(review): stats::qqplot() compares the *quantiles* of the two samples
# (a straight line means similar distribution shapes); if a plain scatterplot
# of the raw pairs was intended here, this should be qplot() — confirm.
qqplot(x=pf$www_likes_received,y=pf$likes_received)
Notice that the distribution looks approx. normal, so the regression is valid. Also, we are appealing to the WLLN.
Here is a new package to work with
# alr3 supplies the Mitchell data set (monthly soil temperatures); loading
# it also attaches car/carData, which masks dplyr::recode (see output).
#install.packages('alr3')
library(alr3)
## Loading required package: car
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
data("Mitchell")
With this data set we will create a new scatter plot:
ggplot(data=Mitchell, aes(x=Month,y=Temp))+geom_point()
Notice that the two variables do not appear to be correlated; this observation is corroborated by the following computation.
# Pearson r ~ 0.057 with p = 0.41: no LINEAR association. A linear test
# cannot detect the seasonal (sinusoidal) pattern revealed below.
with(Mitchell,cor.test(Month,Temp,method='pearson'))
##
## Pearson's product-moment correlation
##
## data: Month and Temp
## t = 0.81816, df = 202, p-value = 0.4142
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.08053637 0.19331562
## sample estimates:
## cor
## 0.05747063
We want to break up the months into 12 month intervals, which we can do as follows.
# Same scatter with x-axis breaks every 12 months plus a connecting line;
# yearly tick spacing makes the seasonal cycle stand out.
ggplot(data=Mitchell, aes(x=Month,y=Temp))+
geom_point()+
scale_x_continuous(breaks=seq(0,203,12))+
geom_line()
Notice that if we stretch the plot, then the plot reveals a sinusoidal pattern. Q: Is there any way to add a curve layer? Yes, add a
geom_line layer as above.
Now we return to the original mean friend count data. Recall the following graph
# p1: mean friend count by (integer) age for ages < 71, line + loess
# smoother. Stored for side-by-side comparison with p2/p3 below.
p1<-ggplot(aes(y=friend_count_mean,x=age),data=subset(pf.fc_by_age,age<71))+
geom_line()+
geom_smooth()
# Peek at the summarized tibble: one row per age with mean/median/count.
head(pf.fc_by_age,10)
## # A tibble: 10 x 4
## age friend_count_mean friend_count_median n
## <int> <dbl> <dbl> <int>
## 1 13 165. 74 484
## 2 14 251. 132 1925
## 3 15 348. 161 2618
## 4 16 352. 172. 3086
## 5 17 350. 156 3283
## 6 18 331. 162 5196
## 7 19 334. 157 4391
## 8 20 283. 135 3769
## 9 21 236. 121 3671
## 10 22 211. 106 3032
# Rows 17-19 correspond to ages 29-31.
pf.fc_by_age[17:19,]
## # A tibble: 3 x 4
## age friend_count_mean friend_count_median n
## <int> <dbl> <dbl> <int>
## 1 29 121. 66 1936
## 2 30 115. 67.5 1716
## 3 31 118. 63 1694
By scaling the data appropriately, the conditional mean for age by months is given by
# Age at month resolution: (12 - dob_month)/12 adds the fraction of a year
# remaining after the birth month (born in December adds 0, January 11/12).
pf$age_with_months<-pf$age+(12-pf$dob_month)/12
head(pf$age_with_months)
## [1] 14.08333 14.08333 14.08333 14.00000 14.00000 14.00000
summary(pf$age_with_months)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 13.17 20.92 28.92 37.76 50.33 113.92
Once again, we use dplyr again to chain functions
# dplyr chain: conditional mean/median friend count per month-resolution age.
pf.fc_by_age_months<-pf %>%
group_by(age_with_months) %>%
summarise(friend_count_mean=mean(friend_count),
friend_count_median=median(friend_count),
n=n()) %>%
arrange(age_with_months)
head(pf.fc_by_age_months)
## # A tibble: 6 x 4
## age_with_months friend_count_mean friend_count_median n
## <dbl> <dbl> <dbl> <int>
## 1 13.2 46.3 30.5 6
## 2 13.2 115. 23.5 14
## 3 13.3 136. 44 25
## 4 13.4 164. 72 33
## 5 13.5 131. 66 45
## 6 13.6 157. 64 54
Now we have friend-count means at month resolution, so we can compare conditional means at different bin widths.
Next I create a plot of the average friend count with ages measured in months, again subsetting the data so that we only look at ages less than 71. This analysis will show how noise and bin width appear in the data and plots.
# p2: same conditional-mean plot as p1 but at month resolution — finer bins
# mean fewer observations per bin, hence a much noisier line.
p2<-ggplot(aes(y=friend_count_mean,x=age_with_months),data=subset(pf.fc_by_age_months, age_with_months<71))+
geom_line()+
geom_smooth()
library(gridExtra)
# Stack p1 (yearly bins) over p2 (monthly bins) to compare noise levels.
grid.arrange(p1,p2,ncol=1)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Notice that when grouped by months there is a lot of noise in the graph. We can filter this data even further. Consider the following filter
# p3: coarser bins — round age to the nearest multiple of 5 before taking
# the per-bin mean. p1/p2/p3 together illustrate the bias-variance trade-off
# of bin width.
p3<-ggplot(aes(y=friend_count,x=round(age/5)*5),data=subset(pf, age<71))+
geom_line(stat = 'summary',fun.y=mean)+
geom_smooth()
grid.arrange(p1,p2,p3,ncol=1)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
Bias-Variance trade off
How do these plots communicate the findings of this particular analysis?
To begin this lesson, create a scatterplot of price vs x. using the ggplot syntax.
# Diamonds: price vs. the x dimension (length in mm), with transparency to
# handle overplotting.
library(ggplot2)
data(diamonds)
ggplot(data=diamonds, aes(x=price,y=x))+
geom_point(alpha=1/10,color='orange')
Now the correlation between price,x,y, and z respectively.
# Price correlates strongly with each linear dimension: x ~ 0.884.
with(diamonds,cor.test(price,x,method='pearson'))
##
## Pearson's product-moment correlation
##
## data: price and x
## t = 440.16, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8825835 0.8862594
## sample estimates:
## cor
## 0.8844352
# y ~ 0.865.
with(diamonds,cor.test(price,y,method='pearson'))
##
## Pearson's product-moment correlation
##
## data: price and y
## t = 401.14, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8632867 0.8675241
## sample estimates:
## cor
## 0.8654209
# z ~ 0.861.
with(diamonds,cor.test(price,z,method='pearson'))
##
## Pearson's product-moment correlation
##
## data: price and z
## t = 393.6, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8590541 0.8634131
## sample estimates:
## cor
## 0.8612494
Next up, I’ll create a simple scatter plot of price vs depth.
Doing so yields
ggplot(data=diamonds,aes(x=price,y=depth))+geom_point()
Change the code to set the transparency of the points to 1/100 and to mark the depth axis every 2 units. (Since depth is mapped to y here, the breaks go on scale_y_continuous.) See the instructor notes for two hints.
# Reduce overplotting with alpha = 1/100 and place a break every 2 units on
# the depth (y) axis, up to the maximum observed depth.
ggplot(data = diamonds, aes(x = price, y = depth)) +
geom_point(alpha=1/100)+
scale_y_continuous(breaks=seq(0,max(range(diamonds$depth)),2))
We see the range for depth of the diamonds is
# Depth spans 43 to 79 (depth is a percentage of the diamond's width).
range(diamonds$depth)
## [1] 43 79
Now we compute the correlation coeff. between depth vs. price
# Depth vs. price: r ~ -0.011 — essentially no linear relationship.
with(diamonds,cor.test(depth,price,method='pearson'))
##
## Pearson's product-moment correlation
##
## data: depth and price
## t = -2.473, df = 53938, p-value = 0.0134
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.019084756 -0.002208537
## sample estimates:
## cor
## -0.0106474
Create a scatterplot of price vs carat and omit the top 1% of price and carat values.
#ggplot(data=diamonds, aes(x=price,y=carat))+geom_point(alpha=1/100)
# BUG FIX: the exercise asks to OMIT the top 1% of price AND carat values,
# but quantile(price, probs = 1/100) is the 1st percentile, so the original
# kept only the cheapest 1% of diamonds and never filtered carat at all.
# Keep observations strictly below the 99th percentile of both variables.
ggplot(data=subset(diamonds,
                   price < quantile(price, probs = 0.99) &
                     carat < quantile(carat, probs = 0.99)),
       aes(x=price,y=carat))+
  geom_point()
Create a scatterplot of price vs. volume (x * y * z). This is a very rough approximation for a diamond’s volume. Create a new variable for volume in the diamonds data frame. This will be useful in a later exercise.
# --- Code appendix: diamonds demo and stateData exploration ----
library(ggplot2)
library(RColorBrewer)
data(diamonds)
qplot(data=diamonds, x=carat,y=price,color=cut) + scale_color_brewer(palette = 'Accent')
getwd()
# NOTE(review): setwd() in a script hurts portability; prefer project-relative paths.
setwd('~/Documents/EDA_Data')
statesInfo<-read.csv('stateData.csv')
# Two equivalent subsets of the Northeast region (state.region == 1):
# subset() vs. logical row indexing dataSet[rowCondition, colCondition].
subset(statesInfo, state.region==1)
statesInfo[statesInfo$state.region==1,]
# Row subsets by exact illiteracy value and by a graduation-rate threshold.
illiteracybracket<-statesInfo[statesInfo$illiteracy==0.5,]
highschoolgradbracket <- statesInfo[statesInfo$highSchoolGrad>50,]
# NOTE(review): with data=statesInfo, the bare column name highSchoolGrad
# would suffice; statesInfo$highSchoolGrad works but mixes scoping styles.
qplot(data=statesInfo, x=state.region, y=illiteracy, color=statesInfo$highSchoolGrad)
# Reddit survey data: inspect, then convert age.range to an ORDERED factor
# so bars plot in logical (not alphabetical) order.
reddit<-read.csv("reddit.csv")
str(reddit)
table(reddit$employment.status)
summary(reddit$employment.status)
levels(reddit$age.range)
# notice 7 different ranges
qplot(reddit$age.range)
qplot(data=reddit, x=income.range)
# ordered() with an explicit level sequence fixes the bar order.
reddit$age.range<-ordered(reddit$age.range, levels=c('Under 18', '18-24', '25-34', '35-44', '45-54', '55-64','65 or Above', 'NA'))
qplot(data=reddit, x=age.range)
# Equivalent alternative: factor(..., ordered = TRUE).
reddit$age.range <- factor(reddit$age.range, levels=c('Under 18', '18-24', '25-34', '35-44', '45-54', '55-64','65 or Above', 'NA'), ordered = T)
# BUG FIX: the original called ordered() on reddit$age.range (whose values
# are age brackets) while supplying income-bracket levels, then assigned the
# result to income.range — every value became NA and the plot was empty.
# Order the income.range column against its own levels instead.
reddit$income.range<-ordered(reddit$income.range, levels=c("Under $20,000", '$20,000 - $29,999', '$30,000 - $39,999', '$40,000 - $49,999', '$50,000 - $69,999', '$70,000 - $99,999', '$100,000 - $149,999', '$150,000 or more', 'NA'))
qplot(data=reddit, x=income.range)
# Pseudo-Facebook data: load the tab-separated file and explore birthdays
# and friend counts with histograms.
getwd()
setwd('~/Documents/EDA_Data')
list.files()
pf<-read.csv('pseudo_facebook.tsv', sep='\t')
names(pf)
library(ggplot2)
#install.packages('ggthemes', dependencies = TRUE)
library(ggthemes)
names(pf)
# Histogram of birth day-of-month, one bar per day.
ggplot(aes(x = dob_day), data = pf) +
geom_histogram(binwidth = 1) +
scale_x_continuous(breaks = 1:31)
# Same, faceted by birth month (4 columns).
ggplot(aes(x = dob_day), data = pf) +
geom_histogram(binwidth = 1) +
scale_x_continuous(breaks = 1:31) +
facet_wrap(~dob_month,ncol=4)
# Friend-count histograms with progressively refined axis limits/binning.
qplot(x=friend_count, data=pf)
qplot(x=friend_count, data=pf, xlim=c(0,1000))
qplot(x=friend_count, data=pf) +
scale_x_continuous(limit=c(0,1000))
qplot(x=friend_count, data=pf, binwidth=25) +
scale_x_continuous(limit=c(0,1000), breaks=seq(0,1000,50))
# Faceting by gender; then the ggplot form with NA genders removed.
qplot(x=friend_count, data=pf, binwidth=25) +
scale_x_continuous(limit=c(0,1000), breaks=seq(0,1000,50)) +
facet_grid(~gender)
ggplot(aes(x = friend_count), data = subset(pf, !is.na(gender))) +
geom_histogram() +
scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) +
facet_wrap(~gender)
table(pf$gender)
# Per-gender five-number summaries of friend_count.
by(pf$friend_count,pf$gender,summary)
# Tenure (days on the site) histogram, then re-binned in years.
ggplot(aes(x = tenure), data = pf) +
geom_histogram(binwidth = 30, color = 'black', fill = '#099DD9')
qplot(x=tenure/365, data=pf, binwidth=1/12, color = I('black'), fill = I('#099DD9'), ylab='tenure', xlab='years')+
scale_x_continuous(seq(1,7,1), limits = c(0,7))
# Age histogram with 1-year bins.
qplot(x=age, data=pf, binwidth=1, color = I('black'), fill = I('#099DD9')) +
scale_x_continuous(seq(5,110,5), limits = c(5,110))
# Compare raw friend_count against log10(+1) and sqrt transformations;
# the +1 avoids log10(0) = -Inf for users with zero friends.
qplot(x=friend_count, data=pf)
summary(pf$friend_count)
summary(log10(pf$friend_count+1))
summary(sqrt(pf$friend_count))
#install.packages('gridExtra')
library(gridExtra)
p1<-qplot(x=friend_count, data=pf)
p2<-qplot(x=log10(friend_count+1), data=pf)
p3<-qplot(x=sqrt(friend_count),data=pf)
grid.arrange(p1,p2,p3)
# Same comparison using scale transformations instead of transformed data.
p1<-ggplot(aes(x=friend_count), data=pf) + geom_histogram()
p2<-p1+scale_x_log10()
p3<-p1+scale_x_sqrt()
grid.arrange(p1,p2,p3)
# Side by side: transformed variable (log units on x) vs. transformed scale
# (original units on x, logarithmic spacing).
logScale<-qplot(x=log10(friend_count+1), data=pf)
countScale<-ggplot(aes(x=friend_count), data=pf) + geom_histogram() +scale_x_log10()
grid.arrange(logScale,countScale, ncol = 2)
# Frequency polygons of friend_count by gender (raw counts).
qplot(x=friend_count, data= subset(pf,!is.na(gender)), binwidth=10, geom='freqpoly', color=gender) +
scale_x_continuous(lim=c(0,1000), breaks=seq(0,1000,50))
# Proportions instead of counts, so gender groups of different sizes are
# comparable (..count../sum(..count..)).
ggplot(aes(x = friend_count, y = ..count../sum(..count..)),
data = subset(pf, !is.na(gender))) +
geom_freqpoly(aes(color = gender), binwidth=10) +
scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) +
xlab('Friend Count') +
ylab('Proportion of users with that friend count')
# qplot equivalent of the proportion plot above.
qplot(x=friend_count, y=..count../sum(..count..),
data= subset(pf,!is.na(gender)),
xlab='Friend Count',
ylab='Proportion of users w/ friend count',
binwidth=10, geom='freqpoly', color=gender) +
scale_x_continuous(lim=c(0,1000), breaks=seq(0,1000,50))
# Web likes by gender on a log10 x scale; the second scale_x_* replaces
# the first (ggplot keeps only the last scale for an aesthetic).
qplot(x=www_likes,
data= subset(pf,!is.na(gender)),
geom='freqpoly', color=gender) +
scale_x_continuous()+
scale_x_log10()
# Total www_likes per gender.
by(pf$www_likes,pf$gender, sum)
qplot(x=friend_count, data= subset(pf,!is.na(gender)), binwidth=10, geom='freqpoly', color=gender) +
scale_x_continuous(lim=c(0,1000), breaks=seq(0,1000,50))
# Boxplots of friend_count by gender, comparing three ways of limiting y.
# NOTE(review): ylim=c(0,1000) on log10-transformed values is effectively
# no limit (log10 values are all < 6); kept as in the original exploration.
qplot(geom='boxplot', data=subset(pf,!is.na(gender)),
x=gender,
y=log10(friend_count),
ylim=c(0,1000))
# scale_y_continuous(limits=...) DROPS rows outside the range, which
# changes the computed quartiles...
qplot(geom='boxplot', data=subset(pf,!is.na(gender)),
x=gender,
y=friend_count)+
scale_y_continuous(limits=c(0,1000))
# ...whereas coord_cartesian only zooms the view, preserving the statistics.
qplot(geom='boxplot', data=subset(pf,!is.na(gender)),
x=gender,
y=friend_count)+
coord_cartesian(ylim=c(0,1000))
qplot(geom='boxplot', data=subset(pf,!is.na(gender)),
x=gender,
y=friend_count)+
coord_cartesian(ylim=c(0,250))
by(pf$friend_count,pf$gender,summary)
# Friendships initiated by gender, zoomed to the interquartile region.
qplot(x=gender, y=friendships_initiated,data=subset(pf,!is.na(gender)),geom='boxplot')+
coord_cartesian(ylim=c(0,150))
by(pf$friendships_initiated,pf$gender,summary)
# Binary indicator: did the user ever like from mobile? (1 = yes, 0 = no.)
summary(pf$mobile_likes)
summary(pf$mobile_likes>0)
pf$mobile_check_in<-NA
pf$mobile_check_in<-ifelse(pf$mobile_likes>0,1,0)
pf$mobile_check_in<-factor(pf$mobile_check_in)
summary(pf$mobile_check_in)
# Fraction of users with at least one mobile like.
sum(pf$mobile_check_in==1)/length(pf$mobile_check_in)
# Diamonds price exploration: summaries, histograms, and per-cut facets.
library(ggplot2)
data(diamonds)
summary(diamonds)
summary(diamonds$price)
# Count of diamonds priced at $15,000 or more.
# NOTE(review): sum(diamonds$price >= 15000) would be the direct idiom.
sum(ifelse(diamonds$price>=15000,1,0))
# Zoom into the cheap tail with narrow ($20) bins.
ggplot(aes(x = price), data = diamonds) +
geom_histogram(binwidth = 20, color = 'black', fill = '#099DD9')+
scale_x_continuous(limits=c(0,1500))
# Price histograms faceted by cut; free y scales since counts differ a lot.
ggplot(aes(x = price),data = diamonds) +
geom_histogram() +
scale_x_continuous() +
facet_wrap(~cut, scales="free_y")
by(diamonds$price,diamonds$cut,summary)
summary(diamonds)
# Price per carat on a log10 scale, again faceted by cut.
ggplot(aes(x = log10(price/carat)),data = diamonds) +
geom_histogram() +
scale_x_continuous() +
facet_wrap(~cut, scales="free_y")
# Price-per-carat by color, zoomed (coord_cartesian preserves statistics).
qplot(geom='boxplot', data=diamonds,
x=color,
y=price/carat) +
coord_cartesian( ylim=c(0,7000))
by(diamonds$price,diamonds$color,IQR)
?diamonds
# Price frequency polygons by color; carat frequency polygon with 0.1 bins.
qplot(x=price, data=diamonds, binwidth=40, geom='freqpoly', color=color)
qplot(data=diamonds,x=carat, geom = 'freqpoly',binwidth=0.1, color=carat)+
scale_x_continuous(breaks = seq(0,3,0.1),lim= c(0,3))+
scale_y_continuous(breaks = seq(2000,10000,1000))
# Gapminder-style wide-to-long reshape: year columns become a Year/value
# pair, then the GDP and HIV tables are merged on their shared columns.
setwd('~/Documents/EDA_Data')
library(tidyr)
# check.names = FALSE keeps the numeric year column names (e.g. "1960")
# instead of mangling them to X1960.
gdp <- read.csv("GDP.csv", header=T, check.names = F)
gdp.T <- gather(data=gdp, key='Year', value='GDP', '1960':'2011',convert = TRUE)
range(gdp.T$Year)
hiv <- read.csv("HIV.csv", header=T, check.names = F)
hiv.T <- gather(data=hiv, key='Year', value='HIV_prev', '1979':'2011', convert = TRUE)
hiv.T$HIV_prev <- as.numeric(hiv.T$HIV_prev)
summary(hiv.T)
# merge() joins on all common column names by default.
gdp.HIV <- merge(gdp.T, hiv.T)
str(gdp.HIV)
# Roundabout way to get the percent of missing GDP values: the "Mean" entry
# of this summary equals the percentage. mean(is.na(gdp.HIV$GDP)) * 100
# would be more direct — TODO confirm the intent before simplifying.
summary((is.na(gdp.HIV$GDP)) / nrow(gdp.HIV) * 100)
summary(gdp.HIV$GDP)
summary(gdp.HIV$HIV_prev)
qplot(y=gdp.HIV$`Income per person (fixed 2000 US$)`,x=gdp.HIV$GDP, data=gdp.HIV, xlim = c(0,4500))
# Age vs. friend count scatterplots, progressively refined: clip to ages
# 13-90, add transparency, jitter, and a sqrt y transform.
library(ggplot2)
pf<-read.csv('pseudo_facebook.tsv', sep='\t')
qplot(x=age,y=friend_count,data=pf)
qplot(age,friend_count,data=pf)
ggplot(aes(x=age, y=friend_count),data=pf)+geom_point()+xlim(13,90)
summary(pf$age)
# alpha = 1/20: a point reaches full darkness only where 20+ users overlap.
ggplot(aes(x=age, y=friend_count),data=pf)+
geom_point(alpha=1/20)+
xlim(13,90)
ggplot(aes(x=age, y=friend_count),data=pf)+
geom_jitter(alpha=1/20)+
xlim(13,90)
# h=0 restricts jitter to horizontal so counts are never jittered negative,
# which matters under the sqrt transform below.
ggplot(aes(x=age, y=friend_count),data=pf)+
geom_point(alpha=1/20, position=position_jitter(h=0))+
xlim(13,90)+
coord_trans(y = "sqrt")
ggplot(aes(x=age, y=friendships_initiated),data=pf)+
geom_point(alpha=1/10, position=position_jitter(h=1))+
xlim(13,90)
# Conditional means with dplyr: first the step-by-step form, then the
# equivalent %>% pipeline producing the same pf.fc_by_age table.
#install.packages('dplyr')
library(dplyr)
age_groups<-group_by(pf,age)
pf.fc_by_age<-summarise(age_groups, friend_count_mean=mean(friend_count), friend_count_median=median(friend_count),n=n())
pf.fc_by_age<-arrange(pf.fc_by_age,age)
head(pf.fc_by_age)
pf.fc_by_age<-pf %>%
group_by(age) %>%
summarise(friend_count_mean=mean(friend_count),
friend_count_median=median(friend_count),
n=n()) %>%
arrange(age)
head(pf.fc_by_age)
# Mean friend count per age as points, then as a line.
ggplot(aes(x=friend_count_mean,y=age),data=pf.fc_by_age)+geom_point()
ggplot(aes(y=friend_count_mean,x=age),data=pf.fc_by_age)+geom_line()
# Raw scatter overlaid with summary lines: mean plus the 10th/50th/90th
# percentiles of friend_count at each age (dashed blue = 10%/90% bands).
ggplot(aes(x=age,y=friend_count),data=pf)+
xlim(13,90)+
geom_point(alpha=0.05,position=position_jitter(h=0),
color='orange')+
coord_trans(y='sqrt')+
geom_line(stat="summary",fun.y=mean)+
geom_line(stat="summary", fun.y=quantile, fun.args=list(probs=0.1),color='blue',linetype=2)+
geom_line(stat="summary", fun.y=quantile, fun.args=list(probs=0.5),color='blue')+
geom_line(stat="summary", fun.y=quantile, fun.args=list(probs=0.9),color='blue',linetype=2)
# Same overlays, zoomed with coord_cartesian (keeps all data in the stats).
ggplot(aes(x=age,y=friend_count),data=pf)+
coord_cartesian(xlim=c(13,70), ylim=c(0,1000))+
geom_point(alpha=0.05,position=position_jitter(h=0),
color='orange')+
geom_line(stat="summary",fun.y=mean,color='red')+
geom_line(stat="summary", fun.y=quantile, fun.args=list(probs=0.1),color='blue',linetype=2)+
geom_line(stat="summary", fun.y=quantile, fun.args=list(probs=0.5),color='blue',linetype=2)+
geom_line(stat="summary", fun.y=quantile, fun.args=list(probs=0.9),color='blue',linetype=2)
# Three equivalent calls of the age/friend_count correlation; the last
# trims the implausible high ages.
cor.test(x=pf$age,y=pf$friend_count, method="pearson", conf.level = 0.95)
with(pf,cor.test(age,friend_count,method='pearson'))
with(subset(pf,age<68),cor.test(age,friend_count,method='pearson'))
# Likes-received scatter clipped at the 95th percentiles with a linear fit,
# its correlation test, and a two-sample Q-Q plot of the same pair.
ggplot(data = pf,aes(x = www_likes_received, y = likes_received)) +
geom_point(alpha=1/20)+
xlim(0,quantile(pf$www_likes_received,0.95))+
ylim(0,quantile(pf$likes_received,0.95))+
geom_smooth(method = 'lm', color="red")
with(pf,cor.test(www_likes_received,likes_received,method='pearson'))
qqplot(x=pf$www_likes_received,y=pf$likes_received)
# Mitchell soil-temperature data: linear correlation misses the seasonal
# cycle that the 12-month axis breaks make visible.
#install.packages('alr3')
library(alr3)
data("Mitchell")
ggplot(data=Mitchell, aes(x=Month,y=Temp))+geom_point()
with(Mitchell,cor.test(Month,Temp,method='pearson'))
ggplot(data=Mitchell, aes(x=Month,y=Temp))+
geom_point()+
scale_x_continuous(breaks=seq(0,203,12))+
geom_line()
# Bin-width comparison: p1 = yearly bins, p2 = monthly bins (noisier),
# p3 = 5-year bins (smoother) — the bias-variance trade-off.
p1<-ggplot(aes(y=friend_count_mean,x=age),data=subset(pf.fc_by_age,age<71))+
geom_line()+
geom_smooth()
head(pf.fc_by_age,10)
pf.fc_by_age[17:19,]
# Month-resolution age: (12 - dob_month)/12 is the fraction of a year
# remaining after the birth month.
pf$age_with_months<-pf$age+(12-pf$dob_month)/12
head(pf$age_with_months)
summary(pf$age_with_months)
pf.fc_by_age_months<-pf %>%
group_by(age_with_months) %>%
summarise(friend_count_mean=mean(friend_count),
friend_count_median=median(friend_count),
n=n()) %>%
arrange(age_with_months)
head(pf.fc_by_age_months)
p2<-ggplot(aes(y=friend_count_mean,x=age_with_months),data=subset(pf.fc_by_age_months, age_with_months<71))+
geom_line()+
geom_smooth()
library(gridExtra)
grid.arrange(p1,p2,ncol=1)
# round(age/5)*5 snaps each age to the nearest multiple of 5.
p3<-ggplot(aes(y=friend_count,x=round(age/5)*5),data=subset(pf, age<71))+
geom_line(stat = 'summary',fun.y=mean)+
geom_smooth()
grid.arrange(p1,p2,p3,ncol=1)
# Diamonds appendix: price vs. dimensions and depth, with correlations.
library(ggplot2)
data(diamonds)
ggplot(data=diamonds, aes(x=price,y=x))+
geom_point(alpha=1/10,color='orange')
# Strong correlation with each linear dimension (roughly 0.86-0.88)...
with(diamonds,cor.test(price,x,method='pearson'))
with(diamonds,cor.test(price,y,method='pearson'))
with(diamonds,cor.test(price,z,method='pearson'))
ggplot(data=diamonds,aes(x=price,y=depth))+geom_point()
# Break the depth (y) axis every 2 units; alpha tames overplotting.
ggplot(data = diamonds, aes(x = price, y = depth)) +
geom_point(alpha=1/100)+
scale_y_continuous(breaks=seq(0,max(range(diamonds$depth)),2))
range(diamonds$depth)
# ...but essentially none with depth (r ~ -0.011).
with(diamonds,cor.test(depth,price,method='pearson'))
#ggplot(data=diamonds, aes(x=price,y=carat))+geom_point(alpha=1/100)
# BUG FIX: the exercise asks to OMIT the top 1% of price AND carat values,
# but quantile(price, probs = 1/100) is the 1st percentile, so the original
# kept only the cheapest 1% of diamonds and never filtered carat at all.
# Keep observations strictly below the 99th percentile of both variables.
ggplot(data=subset(diamonds,
                   price < quantile(price, probs = 0.99) &
                     carat < quantile(carat, probs = 0.99)),
       aes(x=price,y=carat))+
  geom_point()